1. Setup

2. R Basics

Load R Packages

library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(ipumsr)
library(sf)
library(terra)
library(ggspatial)
library(lubridate)

3. Tabular DHS Data

Loading DHS survey data

Note: You will need to update this file name to match your DHS extract number!

# Load the metadata file
ke_ddi <- read_ipums_ddi("data/idhs_00023.xml") # Use your own file name here 

# Use metadata to load the data correctly
ke_dhs <- read_ipums_micro(ke_ddi)
## Use of data from IPUMS DHS is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.
ke_dhs
## # A tibble: 20,964 × 53
##    SAMPLE             SAMPLESTR    COUNTRY    YEAR IDHSPID IDHSHID DHSID IDHSPSU
##    <int+lbl>          <chr+lbl>    <int+lbl> <int> <chr>   <chr>   <chr>   <dbl>
##  1 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  2 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  3 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  4 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  5 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  6 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  7 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  8 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
##  9 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
## 10 40406 [Kenya 2014] 40406 [Keny… 404 [Ken…  2014 40406 … 40406 … KE20… 4.04e10
## # ℹ 20,954 more rows
## # ℹ 45 more variables: IDHSSTRATA <dbl>, CASEID <chr>, HHID <chr>, PSU <dbl>,
## #   STRATA <dbl>, DOMAIN <dbl>, HHNUM <dbl>, CLUSTERNO <dbl>, LINENO <int>,
## #   BIDX <int>, PERWEIGHT <dbl>, KIDWT <dbl>, AWFACTT <dbl>, AWFACTU <dbl>,
## #   AWFACTR <dbl>, AWFACTE <dbl>, AWFACTW <dbl>, DVWEIGHT <dbl>,
## #   URBAN <int+lbl>, GEO_KE1989_2014 <int+lbl>, GEO_KE2014 <int+lbl>,
## #   GEOALT_KE2014 <int+lbl>, AGE <int>, AGE5YEAR <int+lbl>, …

Exploring our DHS data

colnames(ke_dhs)
##  [1] "SAMPLE"          "SAMPLESTR"       "COUNTRY"         "YEAR"           
##  [5] "IDHSPID"         "IDHSHID"         "DHSID"           "IDHSPSU"        
##  [9] "IDHSSTRATA"      "CASEID"          "HHID"            "PSU"            
## [13] "STRATA"          "DOMAIN"          "HHNUM"           "CLUSTERNO"      
## [17] "LINENO"          "BIDX"            "PERWEIGHT"       "KIDWT"          
## [21] "AWFACTT"         "AWFACTU"         "AWFACTR"         "AWFACTE"        
## [25] "AWFACTW"         "DVWEIGHT"        "URBAN"           "GEO_KE1989_2014"
## [29] "GEO_KE2014"      "GEOALT_KE2014"   "AGE"             "AGE5YEAR"       
## [33] "RESIDENT"        "RELIGION"        "MARSTAT"         "CHEB"           
## [37] "FLOOR"           "TOILETTYPE"      "DRINKWTR"        "CURRWORK"       
## [41] "WEALTHQ"         "WEALTHS"         "EDUCLVL"         "EDYRTOTAL"      
## [45] "HEIGHTFEM"       "KIDSEX"          "KIDDOBCMC"       "LINENOKID"      
## [49] "HWHAZWHO"        "BIRTHWT"         "BIRTHWTREF"      "FEVRECENT"      
## [53] "DIARRECENT"
ke_dhs[98, c(1, 2, 3)]
## # A tibble: 1 × 3
##   SAMPLE             SAMPLESTR          COUNTRY    
##   <int+lbl>          <chr+lbl>          <int+lbl>  
## 1 40406 [Kenya 2014] 40406 [Kenya 2014] 404 [Kenya]
ke_dhs[, "DHSID"]
## # A tibble: 20,964 × 1
##    DHSID         
##    <chr>         
##  1 KE201400000001
##  2 KE201400000001
##  3 KE201400000001
##  4 KE201400000001
##  5 KE201400000001
##  6 KE201400000001
##  7 KE201400000001
##  8 KE201400000001
##  9 KE201400000001
## 10 KE201400000001
## # ℹ 20,954 more rows
ke_dhs$RELIGION
## <labelled<integer>[20964]>: Religion
##     [1] 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300
##    [15] 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300 2300
##    [29] 2300 2100 2100 2300 2300 2300 2300 2300 2300 2300 2100 2300 2300 2300
##    [43] 2100 2300 2300 2100 2100 2300 2300 2300 2300 2300 2300 2100 2300 2300
....

Exploring DHS metadata

# Display variable information for all variables in our file
ipums_var_info(ke_dhs)
## # A tibble: 53 × 4
##    var_name   var_label                                      var_desc val_labels
##    <chr>      <chr>                                          <chr>    <list>    
##  1 SAMPLE     IPUMS-DHS sample identifier                    "SAMPLE… <tibble>  
##  2 SAMPLESTR  IPUMS-DHS sample identifier (string)           "SAMPLE… <tibble>  
##  3 COUNTRY    Country                                        "COUNTR… <tibble>  
##  4 YEAR       Year of sample                                 "YEAR r… <tibble>  
##  5 IDHSPID    Unique cross-sample respondent identifier      "IDHSPI… <tibble>  
##  6 IDHSHID    Unique cross-sample household identifier       "IDHSHI… <tibble>  
##  7 DHSID      Key to link DHS clusters to context data (str… "DHSID … <tibble>  
##  8 IDHSPSU    Unique sample-case PSU identifier              "IDHSPS… <tibble>  
##  9 IDHSSTRATA Unique cross-sample sampling strata            "IDHSST… <tibble>  
## 10 CASEID     Sample-specific respondent identifier          "CASEID… <tibble>  
## # ℹ 43 more rows
ipums_var_desc(ke_dhs$BIRTHWT)
## [1] "For children born in the three to five years before the survey, BIRTHWT (M19) reports the child's birthweight in kilos with three implied decimal places (or, alternatively stated, in grams with no decimal places). Children who were not weighed are coded 9996."
ipums_val_labels(ke_dhs$RELIGION)
## # A tibble: 64 × 2
##      val lbl                          
##    <int> <chr>                        
##  1     0 NO RELIGION                  
##  2  1000 MUSLIM                       
##  3  2000 CHRISTIAN                    
##  4  2100 Catholic                     
##  5  2200 Orthodox                     
##  6  2300 Protestant                   
##  7  2310 Lutheran                     
##  8  2320 Anglican                     
##  9  2330 Presbyterian                 
## 10  2340 Baptist/Seventh-day Adventist
## # ℹ 54 more rows
ipums_val_labels(ke_dhs$BIRTHWT)
## # A tibble: 5 × 2
##     val lbl                  
##   <int> <chr>                
## 1  9995 9995+                
## 2  9996 Not weighed at birth 
## 3  9997 Don't know           
## 4  9998 Missing              
## 5  9999 NIU (not in universe)

Cleaning DHS data

ke_dhs$BIRTHWT[1:5]
## <labelled<integer>[5]>: Birthweight in kilos
## [1] 9999 9999 3200 3500 9999
## 
## Labels:
##  value                 label
##   9995                 9995+
##   9996  Not weighed at birth
##   9997            Don't know
##   9998               Missing
##   9999 NIU (not in universe)
# For values greater than 9995, convert to NA (missing)
# Otherwise, divide by 1000 to convert decimal places appropriately
birthwt_clean <- if_else(
  ke_dhs$BIRTHWT > 9995, # Condition
  NA,                    # Replacement if TRUE
  ke_dhs$BIRTHWT / 1000  # Replacement if FALSE
)

birthwt_clean[1:5]
## [1]  NA  NA 3.2 3.5  NA
ke_dhs <- ke_dhs |>
  mutate(BIRTHWT = if_else(BIRTHWT > 9995, NA, BIRTHWT / 1000))

ke_dhs$BIRTHWT[1:5]
## [1]  NA  NA 3.2 3.5  NA
ke_dhs <- ke_dhs |>
  mutate(
    HEIGHTFEM = if_else(HEIGHTFEM >= 9994, NA, HEIGHTFEM / 10),
    BIRTHWTREF = if_else(BIRTHWTREF >= 7, NA, BIRTHWTREF)
  )
ke_dhs$EDUCLVL[1:5]
## <labelled<integer>[5]>: Highest educational level
## [1] 2 2 2 2 1
## 
## Labels:
##  value        label
##      0 No education
##      1      Primary
##      2    Secondary
##      3       Higher
##      8      Missing
case_when(
  ke_dhs$EDUCLVL == 8 ~ NA, 
  ke_dhs$EDUCLVL >= 2 ~ "Secondary+",
  ke_dhs$EDUCLVL == 1 ~ "Primary",
  TRUE ~ "None"
)
##     [1] "Secondary+" "Secondary+" "Secondary+" "Secondary+" "Primary"   
##     [6] "Primary"    "Primary"    "Secondary+" "Secondary+" "Secondary+"
##    [11] "Secondary+" "Secondary+" "Secondary+" "Primary"    "Primary"   
##    [16] "Secondary+" "Secondary+" "Secondary+" "Secondary+" "Primary"   
##    [21] "Primary"    "Primary"    "Primary"    "Secondary+" "Secondary+"
....
ke_dhs <- ke_dhs |>
  mutate(
    EDUCLVL = case_when(
      EDUCLVL == 8 ~ NA,
      EDUCLVL >= 2 ~ "Secondary+",
      EDUCLVL == 1 ~ "Primary",
      TRUE ~ "None"
    )
  )

Note: this is not displayed in slides but should be run in demo…demo code to be included in this file?

ke_dhs <- ke_dhs |> 
  mutate(
    FLOOR = case_when(
      FLOOR >= 996 ~ NA,
      FLOOR >= 300 ~ "Finished",
      TRUE ~ "Unfinished"
    ),
    TOILETTYPE = case_when(
      TOILETTYPE == 0 ~ "No facility",
      TOILETTYPE >= 9996 ~ NA,
      TOILETTYPE >= 2000 ~ "Non-flush",
      TRUE ~ "Flush"
    ),
    DRINKWTR = case_when(
      DRINKWTR >= 9996 ~ NA,
      DRINKWTR >= 4000 ~ "Other",
      DRINKWTR >= 3000 ~ "Surface",
      DRINKWTR >= 2000 ~ "Well",
      TRUE ~ "Piped"
    ),
    FEVRECENT = case_when(
      FEVRECENT >= 97 ~ NA,
      FEVRECENT >= 20 ~ "Yes",
      TRUE ~ "No"
    ),
    DIARRECENT = case_when(
      DIARRECENT >= 97 ~ NA,
      DIARRECENT >= 20 ~ "Yes",
      TRUE ~ "No"
    )
  )

Defining our sample

# str_squish() removes excess whitespace in the ID strings
ke_dhs <- ke_dhs |>
  mutate(KIDID = str_squish(paste(IDHSPID, BIDX)))

ke_dhs$KIDID
##     [1] "40406 0001019 02 1" "40406 0001019 03 1" "40406 0001033 02 1"
##     [4] "40406 0001033 02 2" "40406 0001037 02 1" "40406 0001037 02 2"
##     [7] "40406 0001041 02 1" "40406 0001059 02 1" "40406 0001059 02 2"
##    [10] "40406 0001059 02 3" "40406 0001063 02 1" "40406 0001072 02 1"
##    [13] "40406 0001099 02 1" "40406 0002006 01 1" "40406 0002006 02 1"
....
ke_dhs <- ke_dhs |>
  filter(RESIDENT == 1)

ke_dhs$RESIDENT
## <labelled<integer>[20477]>: Usual resident or visitor
##     [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##    [37] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##    [73] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##   [109] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
....
ke_dhs <- ke_dhs |>
  filter(!is.na(BIRTHWT))

Visualizing our sample

ggplot(ke_dhs) +
  geom_histogram(aes(x = BIRTHWT), bins = 70)

Data types

ke_dhs <- as_factor(ke_dhs) # Convert all labeled columns to factors

ke_dhs$RESIDENT
##    [1] Usual resident Usual resident Usual resident Usual resident
##    [5] Usual resident Usual resident Usual resident Usual resident
##    [9] Usual resident Usual resident Usual resident Usual resident
##   [13] Usual resident Usual resident Usual resident Usual resident
##   [17] Usual resident Usual resident Usual resident Usual resident
....

4. DHS GPS Data

Loading GPS data in R

# Reminder: these are fake coordinates!
ke_gps <- st_read("data/ke_gps.shp", quiet = TRUE)

ke_gps
## Simple feature collection with 1594 features and 20 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 33.98758 ymin: -4.586276 xmax: 41.87459 ymax: 4.611987
## Geodetic CRS:  WGS 84
## First 10 features:
##             DHSID DHSCC DHSYEAR DHSCLUST CCFIPS ADM1FIPS ADM1FIPSNA ADM1SALBNA
## 1  KE201400000001    KE    2014        1     KE     NULL       NULL       NULL
## 2  KE201400000002    KE    2014        2     KE     NULL       NULL       NULL
## 3  KE201400000003    KE    2014        3     KE     NULL       NULL       NULL
## 4  KE201400000004    KE    2014        4     KE     NULL       NULL       NULL
## 5  KE201400000005    KE    2014        5     KE     NULL       NULL       NULL
## 6  KE201400000006    KE    2014        6     KE     NULL       NULL       NULL
## 7  KE201400000007    KE    2014        7     KE     NULL       NULL       NULL
## 8  KE201400000008    KE    2014        8     KE     NULL       NULL       NULL
## 9  KE201400000009    KE    2014        9     KE     NULL       NULL       NULL
## 10 KE201400000010    KE    2014       10     KE     NULL       NULL       NULL
##    ADM1SALBCO ADM1DHS ADM1NAME DHSREGCO DHSREGNA SOURCE URBAN_RURA    LATNUM
## 1        NULL      90  Nairobi        9  Nairobi    CEN          U -1.282723
## 2        NULL      90  Nairobi        9  Nairobi    CEN          U -1.278781
## 3        NULL      90  Nairobi        9  Nairobi    CEN          U -1.279646
## 4        NULL      90  Nairobi        9  Nairobi    CEN          U -1.280380
## 5        NULL      90  Nairobi        9  Nairobi    CEN          U -1.272064
## 6        NULL      90  Nairobi        9  Nairobi    CEN          U -1.263006
## 7        NULL      90  Nairobi        9  Nairobi    CEN          U -1.341546
## 8        NULL      90  Nairobi        9  Nairobi    CEN          U -1.312427
## 9        NULL      90  Nairobi        9  Nairobi    CEN          U -1.318072
## 10       NULL      90  Nairobi        9  Nairobi    CEN          U -1.315103
##     LONGNUM ALT_GPS ALT_DEM DATUM                   geometry
## 1  36.75296    9999    1801 WGS84  POINT (35.71713 0.140332)
## 2  36.75844    9999    1801 WGS84 POINT (35.76638 0.6270388)
## 3  36.74593    9999    1808 WGS84 POINT (36.11351 0.4334558)
## 4  36.69709    9999    1894 WGS84 POINT (35.87447 0.4794212)
## 5  36.74313    9999    1816 WGS84 POINT (36.32321 0.1759624)
## 6  36.71059    9999    1920 WGS84 POINT (35.61286 0.5304094)
## 7  36.69719    9999    1887 WGS84  POINT (35.98676 1.239719)
## 8  36.78541    9999    1702 WGS84  POINT (36.05537 0.538201)
## 9  36.78468    9999    1710 WGS84  POINT (36.10384 1.115597)
## 10 36.80952    9999    1700 WGS84 POINT (36.07837 0.2047607)

Mapping

ggplot() +
  layer_spatial(ke_gps) +
  theme_minimal()

ke_borders <- st_read(
  "data/ke_boundaries/sdr_subnational_boundaries2.shp", 
  quiet = TRUE
)
theme_set(theme_minimal())

ggplot() +
  layer_spatial(ke_borders) +
  layer_spatial(ke_gps)

ggplot() +
  layer_spatial(
    ke_borders,
    fill = "#F6E6CB",  # Polygon fill color
    color = "#7f7f7f", # Polygon outline color
    linewidth = 0.5    # Polygon line width
  ) +
  layer_spatial(
    ke_gps,
    alpha = 0.2,       # Cluster point transparency
    color = "#444444"  # Cluster point color
  ) +
  labs(title = "DHS Cluster Locations", subtitle = "Note: falsified coordinates")

5. Climate Data

Obtaining CHIRTS data

library(chirps)

ke_chirts <- get_chirts(
  vect(ke_borders),                      # Spatial range
  dates = c("2013-01-01", "2013-12-31"), # Date range
  var = "Tmax"                           # Desired temperature variable
)

Raster data in R

ke_chirts <- rast("data/ke_chirts_2013.nc") # Our pre-saved CHIRTS NetCDF

ke_chirts
## class       : SpatRaster 
## dimensions  : 198, 163, 365  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 33.85, 42, -4.750001, 5.149999  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : ke_chirts_2013.nc 
## names       : ke_ch~013_1, ke_ch~013_2, ke_ch~013_3, ke_ch~013_4, ke_ch~013_5, ke_ch~013_6, ... 
## time (days) : 2013-01-01 to 2013-12-31
ke_chirts[[1]]
## class       : SpatRaster 
## dimensions  : 198, 163, 1  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 33.85, 42, -4.750001, 5.149999  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : ke_chirts_2013.nc 
## name        : ke_chirts_2013_1 
## time (days) : 2013-01-01
# Get raster cell values (in this case, max temperature for that cell)
values(ke_chirts)
##          ke_chirts_2013_1 ke_chirts_2013_2 ke_chirts_2013_3 ke_chirts_2013_4
##     [1,]         36.04524         37.71585         36.70155         38.62380
##     [2,]         35.98721         37.81504         36.77429         38.64823
##     [3,]         35.83154         37.79731         36.73273         38.56688
##     [4,]         35.45006         37.50497         36.42466         38.23553
....
# Number of layers
nlyr(ke_chirts)
## [1] 365
# Date that corresponds to each layer
time(ke_chirts)
##   [1] "2013-01-01" "2013-01-02" "2013-01-03" "2013-01-04" "2013-01-05"
##   [6] "2013-01-06" "2013-01-07" "2013-01-08" "2013-01-09" "2013-01-10"
##  [11] "2013-01-11" "2013-01-12" "2013-01-13" "2013-01-14" "2013-01-15"
##  [16] "2013-01-16" "2013-01-17" "2013-01-18" "2013-01-19" "2013-01-20"
##  [21] "2013-01-21" "2013-01-22" "2013-01-23" "2013-01-24" "2013-01-25"
....

Mapping with raster data

# Reclassify missing values
m <- matrix(c(-9999, NA), nrow = 1)
ke_chirts <- classify(ke_chirts, m)

ggplot() +
  layer_spatial(ke_chirts[[1]], alpha = 0.9) +
  layer_spatial(ke_borders, fill = NA, color = "#4c4c4c", linewidth = 0.5)

6. Working with Space and Time

6.1 Transforming Vector Data

Projecting

ke_gps[, "geometry"]
## Simple feature collection with 1594 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 33.98758 ymin: -4.586276 xmax: 41.87459 ymax: 4.611987
## Geodetic CRS:  WGS 84
## First 10 features:
##                      geometry
## 1   POINT (35.71713 0.140332)
## 2  POINT (35.76638 0.6270388)
## 3  POINT (36.11351 0.4334558)
## 4  POINT (35.87447 0.4794212)
## 5  POINT (36.32321 0.1759624)
## 6  POINT (35.61286 0.5304094)
## 7   POINT (35.98676 1.239719)
## 8   POINT (36.05537 0.538201)
## 9   POINT (36.10384 1.115597)
## 10 POINT (36.07837 0.2047607)

Projection in R

ke_gps <- st_transform(ke_gps, crs = 32637)

ke_gps[, "geometry"]
## Simple feature collection with 1594 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -58474.75 ymin: -506942.7 xmax: 819246.2 ymax: 510915
## Projected CRS: WGS 84 / UTM zone 37N
## First 10 features:
##                     geometry
## 1  POINT (134498.7 15536.57)
## 2  POINT (140008.4 69417.92)
## 3  POINT (178677.7 47971.17)
## 4  POINT (152044.8 53069.94)
## 5    POINT (202032 19470.52)
## 6  POINT (122890.9 58729.47)
## 7  POINT (164623.8 137217.3)
## 8  POINT (172205.7 59566.57)
## 9    POINT (177653 123465.8)
## 10 POINT (174755.5 22661.86)
ggplot() +
  layer_spatial(ke_gps)

Creating Buffers

ke_buffer <- st_buffer(ke_gps, dist = 10000)

ke_buffer[, "geometry"]
## Simple feature collection with 1594 features and 0 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -68474.75 ymin: -516942.7 xmax: 829246.2 ymax: 520915
## Projected CRS: WGS 84 / UTM zone 37N
## First 10 features:
##                          geometry
## 1  POLYGON ((144498.7 15536.57...
## 2  POLYGON ((150008.4 69417.92...
## 3  POLYGON ((188677.7 47971.17...
## 4  POLYGON ((162044.8 53069.94...
## 5  POLYGON ((212032 19470.52, ...
## 6  POLYGON ((132890.9 58729.47...
## 7  POLYGON ((174623.8 137217.3...
## 8  POLYGON ((182205.7 59566.57...
## 9  POLYGON ((187653 123465.8, ...
## 10 POLYGON ((184755.5 22661.86...
ggplot() +
  layer_spatial(ke_buffer, alpha = 0)

Transform Back

ke_buffer <- st_transform(ke_buffer, crs = 4326) # WGS 84 CRS

6.2 Transforming Raster Data

Average temperature

ke_chirts_mean <- mean(ke_chirts)

ke_chirts_mean
## class       : SpatRaster 
## dimensions  : 198, 163, 1  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 33.85, 42, -4.750001, 5.149999  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## name        :     mean 
## min value   : 10.84893 
## max value   : 41.83465
theme_set(theme_dhs_base())

ggplot() +
  layer_spatial(
    mask(ke_chirts_mean, ke_borders),
    alpha = 0.9
  ) +
  layer_spatial(ke_borders, fill = NA, color = "#4c4c4c", linewidth = 0.5) +
  labs(title = "Mean Temperature (°C)", subtitle = "Kenya, 2013", fill = "") +
  scale_fill_chirts_c()
## Warning: Removed 13057 rows containing missing values or values outside the scale range
## (`geom_raster()`).

Monthly aggregation

# Average daily CHIRTS within months
ke_chirts_mean_mnths <- tapp(
  ke_chirts,
  fun = mean,
  index = "yearmonths"
)

ke_chirts_mean_mnths
## class       : SpatRaster 
## dimensions  : 198, 163, 12  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 33.85, 42, -4.750001, 5.149999  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : ym_201301, ym_201302, ym_201303, ym_201304, ym_201305, ym_201306, ... 
## min values  :  13.05012,  14.09646,  12.47642,  11.19747,  11.40163,  8.428434, ... 
## max values  :  42.44487,  43.45883,  43.19976,  40.33416,  41.96343, 40.689007, ... 
## time (ymnts): 2013-Jan to 2013-Dec
months <- c("January", "February", "March", "April",
            "May", "June", "July", "August",
            "September", "October", "November", "December")

panels <- purrr::map2(
  split_raster(ke_chirts_mean_mnths),
  months,
  function(r, m) {
    chirts_panel_continuous(
      r,
      borders = ke_borders,
      panel_title = m,
      show_scale = FALSE,
      fill_lab = "",
      limits = c(7, 44)
    )
  }
)

patchwork::wrap_plots(panels) +
  patchwork::plot_layout(guides = "collect", ncol = 6) +
  patchwork::plot_annotation(
    title = "Average Monthly Temperature (°C)",
    subtitle = "Kenya, 2013",
    caption = "Source: Climate Hazards Center"
  )

Temperature deviation

# List all 12 monthly average files
chirts_mnth_means <- list.files("data/chirts", full.names = TRUE)

# Load monthly averages into a single raster stack
chirts_norm <- rast(chirts_mnth_means)
# Subtract 2013 monthly average from 10-year monthly average
chirts_dev <- ke_chirts_mean_mnths - chirts_norm

chirts_dev
## class       : SpatRaster 
## dimensions  : 198, 163, 12  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 33.85, 42, -4.750001, 5.149999  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       :  ym_201301,  ym_201302,  ym_201303,   ym_201304, ym_201305,  ym_201306, ... 
## min values  : -0.7652607, -0.5505671, -1.6592527, -1.94416453, -1.233843, -1.9243449, ... 
## max values  :  1.2226097,  0.9732178,  0.9714775,  0.09900742,  1.311446,  0.8205259, ... 
## time (ymnts): 2013-Jan to 2013-Dec
ggplot() +
  layer_spatial(chirts_dev[[5]]) +
  scale_fill_gradient2(
    low = "#438b81",
    mid = "#f0ebd7",
    high = "#c25d33",
    limits = c(-2, 2),
    na.value = "transparent"
  ) +
  labs(
    title = "Difference from 10-year average temperature (°C)",
    subtitle = "May, 2013",
    fill = ""
  )

panels <- purrr::map2(
  split_raster(chirts_dev),
  months,
  function(r, m) {
    chirts_panel_diverging(
      r,
      borders = ke_borders,
      panel_title = m,
      show_scale = FALSE,
      fill_lab = "",
      low = "#438b81", mid = "#f0ebd7", high = "#c25d33",
      limits = c(-2, 2),
      na.value = "transparent"
    )
  }
)

patchwork::wrap_plots(panels) +
  patchwork::plot_layout(guides = "collect", ncol = 6) +
  patchwork::plot_annotation(
    title = "Deviation from 10-Year Monthly Average (°C)",
    subtitle = "Kenya, 2013",
    caption = "Source: Climate Hazards Center"
  )

Spatial aggregation

ke_mean_to_plot <- mask(ke_chirts_mean_mnths[[1]], ke_borders)

ggplot() +
  layer_spatial(ke_mean_to_plot, alpha = 0.9) +
  layer_spatial(ke_borders, fill = NA, color = "#444444") +
  layer_spatial(ke_buffer, fill = NA, color = alpha("black", 0.5)) +
  labs(title = "Mean Temperature (°C)", subtitle = "January, 2013", fill = "") +
  scale_fill_chirts_c()
## Warning: Removed 13057 rows containing missing values or values outside the scale range
## (`geom_raster()`).

Zooming in

# Select a single cluster to demonstrate
clust1 <- ke_buffer |>
  filter(DHSID == "KE201400001487")
ggplot() +
  layer_spatial(crop(ke_chirts_mean_mnths[[1]], clust1, snap = "out")) +
  layer_spatial(clust1, fill = NA, color = "black") +
  labs(title = "Mean Temperature (°C)", subtitle = "Single Cluster", fill = "") +
  scale_fill_chirts_c()

Spatial aggregation

extract(ke_chirts_mean_mnths[[1]], clust1)
##    ID ym_201301
## 1   1  35.12662
## 2   1  34.01415
## 3   1  32.42788
## 4   1  35.06354
## 5   1  33.72320
## 6   1  32.87634
## 7   1  34.81424
## 8   1  33.69226
## 9   1  33.22820
## 10  1  33.79419
extract(
  ke_chirts_mean_mnths[[1]],
  clust1,
  weights = TRUE
)
##    ID ym_201301 weight
## 1   1  35.65890   0.02
## 2   1  35.12662   0.69
## 3   1  34.01415   1.00
## 4   1  32.42788   0.69
## 5   1  33.32614   0.02
## 6   1  35.59715   0.27
## 7   1  35.06354   1.00
## 8   1  33.72320   1.00
## 9   1  32.87634   1.00
## 10  1  33.62588   0.27
## 11  1  35.55475   0.14
## 12  1  34.81424   0.98
## 13  1  33.69226   1.00
## 14  1  33.22820   0.98
## 15  1  33.50158   0.14
## 16  1  34.91669   0.24
## 17  1  33.79419   0.54
## 18  1  33.35481   0.24
extract(
  ke_chirts_mean_mnths[[1]],
  clust1,
  weights = TRUE,
  fun = mean
)
##   ID ym_201301
## 1  1  33.95651

Scaling up

ke_chirts_clust <- extract(
  ke_chirts_mean_mnths, # Extract values for each month of temperature data
  ke_buffer,            # Use all cluster polygons
  weights = TRUE,
  fun = mean
)
# Update to use cluster IDs
ke_chirts_clust$ID <- ke_buffer$DHSID

ke_chirts_clust
##                  ID ym_201301 ym_201302 ym_201303 ym_201304 ym_201305 ym_201306
## 1    KE201400000001  25.50097  26.93703  26.74054  23.99621  24.73310  23.18324
## 2    KE201400000002  27.44769  28.71789  28.42848  25.70832  26.39564  25.03732
## 3    KE201400000003  31.52870  32.83890  32.51355  29.73158  30.76798  29.27555
## 4    KE201400000004  30.59242  32.00248  31.70173  28.82798  29.88593  28.54942
....

7. Linking climate exposure to survey data

7.1 Data source formats

Wide vs. long format

ke_chirts_long <- ke_chirts_clust |>
  pivot_longer(
    cols = -ID, # Do not pivot the ID col
    names_to = "CHIRTS_DATE", # Rename output columns
    values_to = "MEAN_TEMP"
  )

ke_chirts_long
## # A tibble: 19,128 × 3
##    ID             CHIRTS_DATE MEAN_TEMP
##    <chr>          <chr>           <dbl>
##  1 KE201400000001 ym_201301        25.5
##  2 KE201400000001 ym_201302        26.9
##  3 KE201400000001 ym_201303        26.7
##  4 KE201400000001 ym_201304        24.0
##  5 KE201400000001 ym_201305        24.7
##  6 KE201400000001 ym_201306        23.2
##  7 KE201400000001 ym_201307        23.4
##  8 KE201400000001 ym_201308        23.0
##  9 KE201400000001 ym_201309        24.5
## 10 KE201400000001 ym_201310        25.1
## # ℹ 19,118 more rows

Dates in DHS

ke_dhs$KIDDOBCMC
##    [1] 1364 1331 1356 1333 1348 1352 1321 1331 1351 1376 1333 1354 1318 1372
##   [15] 1333 1318 1367 1363 1359 1341 1318 1347 1371 1351 1320 1317 1337 1363
##   [29] 1330 1364 1373 1356 1335 1349 1320 1332 1330 1348 1366 1341 1346 1324
##   [43] 1334 1327 1334 1316 1321 1364 1366 1352 1354 1363 1369 1351 1351 1362
##   [57] 1331 1331 1345 1327 1344 1327 1340 1322 1344 1319 1315 1372 1363 1326
....

Dates in CHIRTS

ke_chirts_long$CHIRTS_DATE
##     [1] "ym_201301" "ym_201302" "ym_201303" "ym_201304" "ym_201305" "ym_201306"
##     [7] "ym_201307" "ym_201308" "ym_201309" "ym_201310" "ym_201311" "ym_201312"
##    [13] "ym_201301" "ym_201302" "ym_201303" "ym_201304" "ym_201305" "ym_201306"
##    [19] "ym_201307" "ym_201308" "ym_201309" "ym_201310" "ym_201311" "ym_201312"
##    [25] "ym_201301" "ym_201302" "ym_201303" "ym_201304" "ym_201305" "ym_201306"
....

Converting Dates

ipums_var_desc(ke_ddi, "KIDDOBCMC")
## [1] "KIDDOBCMC (B3) reports the century month code for the date of birth of the child. \n\nCentury month codes (CMC) are particularly useful for checking the consistency of dates, calculating intervals between events, and imputing dates when the information for an event is missing or partially complete.\n\nCentury month codes (CMC) are calculated by multiplying by 12 the difference between the year of an event and 1900. The year 1900 was chosen as the reference period because all of the DHS-relevant events occurred during the twentieth or twenty-first centuries. The month of the event is added to the previous result.\nCMC = (Year minus 1900) * 12 + MonthFor example, the CMC for June 2002 is:\nCMC = (2002 minus 1900) * 12 + 6 = 1230In other words, 1,230 months have elapsed between January 1900 and June 2002. Starting with CMC figures, one can calculate the month and year using the following formulas:\nYear = int( ( CMC minus 1 )/12 ) + 1900[int(x) is the integer part of x]Month = CMC minus ( ( Year minus 1900 ) * 12 )For more information, visit The DHS Program website to find the latest Guide to DHS Statistics."
# Replace "ym_" with ""
ke_chirts_long <- ke_chirts_long |>
  mutate(CHIRTS_DATE = str_replace(CHIRTS_DATE, "ym_", ""))

ke_chirts_long
## # A tibble: 19,128 × 3
##    ID             CHIRTS_DATE MEAN_TEMP
##    <chr>          <chr>           <dbl>
##  1 KE201400000001 201301           25.5
##  2 KE201400000001 201302           26.9
##  3 KE201400000001 201303           26.7
##  4 KE201400000001 201304           24.0
##  5 KE201400000001 201305           24.7
##  6 KE201400000001 201306           23.2
##  7 KE201400000001 201307           23.4
##  8 KE201400000001 201308           23.0
##  9 KE201400000001 201309           24.5
## 10 KE201400000001 201310           25.1
## # ℹ 19,118 more rows
ke_chirts_long <- ke_chirts_long |>
  mutate(CHIRTS_DATE = ym(CHIRTS_DATE))

ke_chirts_long
## # A tibble: 19,128 × 3
##    ID             CHIRTS_DATE MEAN_TEMP
##    <chr>          <date>          <dbl>
##  1 KE201400000001 2013-01-01       25.5
##  2 KE201400000001 2013-02-01       26.9
##  3 KE201400000001 2013-03-01       26.7
##  4 KE201400000001 2013-04-01       24.0
##  5 KE201400000001 2013-05-01       24.7
##  6 KE201400000001 2013-06-01       23.2
##  7 KE201400000001 2013-07-01       23.4
##  8 KE201400000001 2013-08-01       23.0
##  9 KE201400000001 2013-09-01       24.5
## 10 KE201400000001 2013-10-01       25.1
## # ℹ 19,118 more rows
ke_chirts_long <- ke_chirts_long |>
  mutate(CHIRTS_DATE = (year(CHIRTS_DATE) - 1900) * 12 + month(CHIRTS_DATE))

ke_chirts_long
## # A tibble: 19,128 × 3
##    ID             CHIRTS_DATE MEAN_TEMP
##    <chr>                <dbl>     <dbl>
##  1 KE201400000001        1357      25.5
##  2 KE201400000001        1358      26.9
##  3 KE201400000001        1359      26.7
##  4 KE201400000001        1360      24.0
##  5 KE201400000001        1361      24.7
##  6 KE201400000001        1362      23.2
##  7 KE201400000001        1363      23.4
##  8 KE201400000001        1364      23.0
##  9 KE201400000001        1365      24.5
## 10 KE201400000001        1366      25.1
## # ℹ 19,118 more rows

Workflow note

ke_chirts_long <- ke_chirts_clust |>
  pivot_longer(
    cols = -ID,
    names_to = "CHIRTS_DATE",
    values_to = "MEAN_TEMP"
  ) |>
  mutate(
    CHIRTS_DATE = str_replace(CHIRTS_DATE, "ym_", ""),
    CHIRTS_DATE = ym(CHIRTS_DATE),
    CHIRTS_DATE = (year(CHIRTS_DATE) - 1900) * 12 + month(CHIRTS_DATE)
  )

7.2 Joining data sources

What we have at this point

# Cluster-level monthly CHIRTS data
ke_chirts_long
## # A tibble: 19,128 × 3
##    ID             CHIRTS_DATE MEAN_TEMP
##    <chr>                <dbl>     <dbl>
##  1 KE201400000001        1357      25.5
##  2 KE201400000001        1358      26.9
##  3 KE201400000001        1359      26.7
##  4 KE201400000001        1360      24.0
##  5 KE201400000001        1361      24.7
##  6 KE201400000001        1362      23.2
##  7 KE201400000001        1363      23.4
##  8 KE201400000001        1364      23.0
##  9 KE201400000001        1365      24.5
## 10 KE201400000001        1366      25.1
## # ℹ 19,118 more rows
# Child-level tabular DHS survey data
ke_dhs[, c("DHSID", "KIDID", "KIDDOBCMC", "BIRTHWT")]
## # A tibble: 5,765 × 4
##    DHSID          KIDID              KIDDOBCMC BIRTHWT
##    <chr>          <chr>                  <dbl>   <dbl>
##  1 KE201400000001 40406 0001033 02 1      1364     3.2
##  2 KE201400000001 40406 0001033 02 2      1331     3.5
##  3 KE201400000002 40406 0002051 02 1      1356     3  
##  4 KE201400000002 40406 0002051 02 2      1333     2.3
##  5 KE201400000003 40406 0003023 02 1      1348     3.2
##  6 KE201400000003 40406 0003047 02 1      1352     3.2
##  7 KE201400000003 40406 0003056 02 3      1321     2.8
##  8 KE201400000003 40406 0003072 02 1      1331     4.5
##  9 KE201400000004 40406 0004074 02 1      1351     2  
## 10 KE201400000005 40406 0005011 02 1      1376     3.2
## # ℹ 5,755 more rows

Joining climate and DHS data

inner_join(
  ke_dhs,
  ke_chirts_long
)
## Error in `inner_join()`:
## ! `by` must be supplied when `x` and `y` have no common variables.
## ℹ Use `cross_join()` to perform a cross-join.
ke_dhs_chirts <- inner_join(
  ke_dhs,
  ke_chirts_long,
  by = c("DHSID" = "ID", "KIDDOBCMC" = "CHIRTS_DATE")
)
# Each child is now associated with monthly average temperature for their
# birth month
ke_dhs_chirts |>
  select(DHSID, KIDID, KIDDOBCMC, MEAN_TEMP, BIRTHWT)
## # A tibble: 1,253 × 5
##    DHSID          KIDID              KIDDOBCMC MEAN_TEMP BIRTHWT
##    <chr>          <chr>                  <dbl>     <dbl>   <dbl>
##  1 KE201400000001 40406 0001033 02 1      1364      23.0    3.2 
##  2 KE201400000007 40406 0007056 02 1      1367      28.9    3   
##  3 KE201400000008 40406 0008021 02 1      1363      31.1    3.8 
##  4 KE201400000008 40406 0008052 02 1      1359      33.9    2.95
##  5 KE201400000010 40406 0010108 01 1      1363      28.9    3.26
##  6 KE201400000010 40406 0010108 04 1      1364      28.5    2.3 
##  7 KE201400000012 40406 0012061 02 1      1366      32.2    4.9 
##  8 KE201400000017 40406 0017030 02 1      1364      20.4    3.9 
##  9 KE201400000017 40406 0017046 02 1      1366      22.3    2.5 
## 10 KE201400000018 40406 0018005 02 1      1363      21.5    3.2 
## # ℹ 1,243 more rows

Exploring our joined dataset

ggplot(ke_dhs_chirts) +
  geom_point(aes(x = MEAN_TEMP, y = BIRTHWT), alpha = 0.5, size = 2)
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_point()`).